Astrid Wang
December 18, 2020
datasource: https://www.kaggle.com/mirichoi0218/insurance or https://gist.github.com/meperezcuello/82a9f1c1c473d6585e750ad2e3c05a41
The report paper including method discussion and reference is in a separate docs file
#installed.packages("car")
#install.packages("olsrr")
#install.packages("reshape2")
#install.packages("tidyverse")
#install.packages("caret")
#install.packages("MuMIn")
#library(kenlab)
library(MuMIn)
library(reshape2)
library(ggplot2)
library("olsrr")
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
library(car)
## Loading required package: carData
library(caret)
## Loading required package: lattice
df<- read.csv(file = 'insurance2.csv')
head(df)
df <- transform(sex=factor(sex),smoker=factor(smoker),region=factor(region),df)
f1<-lm(charges~age+sex+bmi+children+smoker+region,data=df) # base model, or full model
summary(f1)
##
## Call:
## lm(formula = charges ~ age + sex + bmi + children + smoker +
## region, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11304.9 -2848.1 -982.1 1393.9 29992.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11938.5 987.8 -12.086 < 2e-16 ***
## age 256.9 11.9 21.587 < 2e-16 ***
## sexmale -131.3 332.9 -0.394 0.693348
## bmi 339.2 28.6 11.860 < 2e-16 ***
## children 475.5 137.8 3.451 0.000577 ***
## smokeryes 23848.5 413.1 57.723 < 2e-16 ***
## regionnorthwest -353.0 476.3 -0.741 0.458769
## regionsoutheast -1035.0 478.7 -2.162 0.030782 *
## regionsouthwest -960.0 477.9 -2.009 0.044765 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6062 on 1329 degrees of freedom
## Multiple R-squared: 0.7509, Adjusted R-squared: 0.7494
## F-statistic: 500.8 on 8 and 1329 DF, p-value: < 2.2e-16
contrasts(df$sex)
## male
## female 0
## male 1
contrasts(df$smoker)
## yes
## no 0
## yes 1
contrasts(df$region)
## northwest southeast southwest
## northeast 0 0 0
## northwest 1 0 0
## southeast 0 1 0
## southwest 0 0 1
vif(f1)
## GVIF Df GVIF^(1/(2*Df))
## age 1.016822 1 1.008376
## sex 1.008900 1 1.004440
## bmi 1.106630 1 1.051965
## children 1.004011 1 1.002003
## smoker 1.012074 1 1.006019
## region 1.098893 3 1.015841
plot(density(df$charges))
plot(density(log(df$charges)))
plot(f1)
table(df$children) # try to understand the dataframe
##
## 0 1 2 3 4 5
## 574 324 240 157 25 18
plot(df$age,df$children)
plot(log(df$age),log(df$charges))
f2<-ols_step_backward_p(f1,details = TRUE)
## Backward Elimination Method
## ---------------------------
##
## Candidate Terms:
##
## 1 . age
## 2 . sex
## 3 . bmi
## 4 . children
## 5 . smoker
## 6 . region
##
## We are eliminating variables based on p value...
##
## - sex
##
## Backward Elimination: Step 1
##
## Variable sex Removed
##
## Model Summary
## --------------------------------------------------------------------
## R 0.867 RMSE 6060.178
## R-Squared 0.751 Coef. Var 45.667
## Adj. R-Squared 0.750 MSE 36725751.333
## Pred R-Squared 0.747 MAE 4171.710
## --------------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## -----------------------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## -----------------------------------------------------------------------------------
## Regression 1.47229e+11 7 21032710327.911 572.697 0.0000
## Residual 48845249272.988 1330 36725751.333
## Total 196074221568.368 1337
## -----------------------------------------------------------------------------------
##
## Parameter Estimates
## ---------------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## ---------------------------------------------------------------------------------------------------------
## (Intercept) -11990.270 978.762 -12.250 0.000 -13910.355 -10070.185
## age 256.974 11.891 0.298 21.610 0.000 233.646 280.301
## bmi 338.665 28.559 0.171 11.858 0.000 282.639 394.690
## children 474.566 137.740 0.047 3.445 0.001 204.355 744.778
## smokeryes 23836.301 411.856 0.795 57.875 0.000 23028.341 24644.260
## regionnorthwest -352.182 476.120 -0.012 -0.740 0.460 -1286.211 581.847
## regionsoutheast -1034.360 478.537 -0.038 -2.162 0.031 -1973.130 -95.590
## regionsouthwest -959.375 477.778 -0.034 -2.008 0.045 -1896.656 -22.094
## ---------------------------------------------------------------------------------------------------------
##
##
##
## No more variables satisfy the condition of p value = 0.3
##
##
## Variables Removed:
##
## - sex
##
##
## Final Model Output
## ------------------
##
## Model Summary
## --------------------------------------------------------------------
## R 0.867 RMSE 6060.178
## R-Squared 0.751 Coef. Var 45.667
## Adj. R-Squared 0.750 MSE 36725751.333
## Pred R-Squared 0.747 MAE 4171.710
## --------------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## -----------------------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## -----------------------------------------------------------------------------------
## Regression 1.47229e+11 7 21032710327.911 572.697 0.0000
## Residual 48845249272.988 1330 36725751.333
## Total 196074221568.368 1337
## -----------------------------------------------------------------------------------
##
## Parameter Estimates
## ---------------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## ---------------------------------------------------------------------------------------------------------
## (Intercept) -11990.270 978.762 -12.250 0.000 -13910.355 -10070.185
## age 256.974 11.891 0.298 21.610 0.000 233.646 280.301
## bmi 338.665 28.559 0.171 11.858 0.000 282.639 394.690
## children 474.566 137.740 0.047 3.445 0.001 204.355 744.778
## smokeryes 23836.301 411.856 0.795 57.875 0.000 23028.341 24644.260
## regionnorthwest -352.182 476.120 -0.012 -0.740 0.460 -1286.211 581.847
## regionsoutheast -1034.360 478.537 -0.038 -2.162 0.031 -1973.130 -95.590
## regionsouthwest -959.375 477.778 -0.034 -2.008 0.045 -1896.656 -22.094
## ---------------------------------------------------------------------------------------------------------
f2<-lm(charges ~ age + bmi + children + smoker+region, data = df)
summary(f2)
##
## Call:
## lm(formula = charges ~ age + bmi + children + smoker + region,
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11367.2 -2835.4 -979.7 1361.9 29935.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11990.27 978.76 -12.250 < 2e-16 ***
## age 256.97 11.89 21.610 < 2e-16 ***
## bmi 338.66 28.56 11.858 < 2e-16 ***
## children 474.57 137.74 3.445 0.000588 ***
## smokeryes 23836.30 411.86 57.875 < 2e-16 ***
## regionnorthwest -352.18 476.12 -0.740 0.459618
## regionsoutheast -1034.36 478.54 -2.162 0.030834 *
## regionsouthwest -959.37 477.78 -2.008 0.044846 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6060 on 1330 degrees of freedom
## Multiple R-squared: 0.7509, Adjusted R-squared: 0.7496
## F-statistic: 572.7 on 7 and 1330 DF, p-value: < 2.2e-16
summary(powerTransform(f2)) # try power transformations but failed
## bcPower Transformation to Normality
## Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
## Y1 0.1507 0.15 0.1049 0.1966
##
## Likelihood ratio test that transformation parameter is equal to 0
## (log transformation)
## LRT df pval
## LR test, lambda = (0) 41.3912 1 1.2462e-10
##
## Likelihood ratio test that no transformation is needed
## LRT df pval
## LR test, lambda = (1) 1162.427 1 < 2.22e-16
invResPlot(f2)
f6<-lm(log(charges) ~ age + bmi + children + smoker+region, data = df) # use log instead of 1/3 power for the response variable because it makes more quantiles fashion the Normal distribution
summary(f6)
##
## Call:
## lm(formula = log(charges) ~ age + bmi + children + smoker + region,
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.10302 -0.19707 -0.05206 0.06564 2.15091
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.0008478 0.0719853 97.254 < 2e-16 ***
## age 0.0346490 0.0008746 39.618 < 2e-16 ***
## bmi 0.0130711 0.0021004 6.223 6.52e-10 ***
## children 0.1013204 0.0101304 10.002 < 2e-16 ***
## smokeryes 1.5472965 0.0302910 51.081 < 2e-16 ***
## regionnorthwest -0.0633386 0.0350174 -1.809 0.070712 .
## regionsoutheast -0.1568166 0.0351952 -4.456 9.07e-06 ***
## regionsouthwest -0.1285638 0.0351393 -3.659 0.000263 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4457 on 1330 degrees of freedom
## Multiple R-squared: 0.7663, Adjusted R-squared: 0.765
## F-statistic: 622.9 on 7 and 1330 DF, p-value: < 2.2e-16
plot(f6)
df2<-suppressWarnings(transform(df, SD=apply(df,1, sd, na.rm = TRUE))) # new column sd
head(df2)
f3<-lm(log(charges) ~ age + bmi + children + smoker+region, weights = 1/SD^2,data = df2)
summary(f3)
##
## Call:
## lm(formula = log(charges) ~ age + bmi + children + smoker + region,
## data = df2, weights = 1/SD^2)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -4.276e-04 -3.800e-05 -2.050e-06 5.623e-05 3.691e-04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.6002913 0.0296889 222.315 < 2e-16 ***
## age 0.0524545 0.0006374 82.288 < 2e-16 ***
## bmi 0.0001969 0.0008373 0.235 0.814
## children 0.1956888 0.0063994 30.579 < 2e-16 ***
## smokeryes 1.5565488 0.0697526 22.315 < 2e-16 ***
## regionnorthwest -0.0834179 0.0167097 -4.992 6.76e-07 ***
## regionsoutheast -0.2893256 0.0158689 -18.232 < 2e-16 ***
## regionsouthwest -0.2598225 0.0157648 -16.481 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.933e-05 on 1330 degrees of freedom
## Multiple R-squared: 0.9067, Adjusted R-squared: 0.9063
## F-statistic: 1847 on 7 and 1330 DF, p-value: < 2.2e-16
plot(f3) # increased normality a lot
f7<-lm((charges)^(1/3) ~ age + bmi + children + smoker+region, weights = 1/SD^2,data = df2) # normality issue did not solved by 1/3 power transformation, therefore log transformation is a good chioce. $
summary(f7)
##
## Call:
## lm(formula = (charges)^(1/3) ~ age + bmi + children + smoker +
## region, data = df2, weights = 1/SD^2)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -1.248e-03 -1.457e-04 3.290e-06 2.436e-04 1.994e-03
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.202067 0.134281 53.634 < 2e-16 ***
## age 0.277962 0.002883 96.409 < 2e-16 ***
## bmi 0.004463 0.003787 1.178 0.239
## children 0.891054 0.028944 30.785 < 2e-16 ***
## smokeryes 11.233184 0.315487 35.606 < 2e-16 ***
## regionnorthwest -0.411715 0.075577 -5.448 6.08e-08 ***
## regionsoutheast -1.226795 0.071774 -17.092 < 2e-16 ***
## regionsouthwest -1.140764 0.071303 -15.999 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.000404 on 1330 degrees of freedom
## Multiple R-squared: 0.9287, Adjusted R-squared: 0.9284
## F-statistic: 2476 on 7 and 1330 DF, p-value: < 2.2e-16
plot(f7)
pairs(~charges+age+bmi+ children ,data=df,main="Insurance Scatterplot Matrix")
f4<-lm(log(charges) ~ age + bmi + children + smoker+region+age:children+bmi:smoker+age:bmi+age:smoker+bmi:children+children:smoker, weights = 1/SD^2,data = df2) # add all possible interaction terms
summary(f4)
##
## Call:
## lm(formula = log(charges) ~ age + bmi + children + smoker + region +
## age:children + bmi:smoker + age:bmi + age:smoker + bmi:children +
## children:smoker, data = df2, weights = 1/SD^2)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -3.845e-04 -2.971e-05 -1.600e-06 2.938e-05 3.763e-04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.473e+00 6.961e-02 92.998 < 2e-16 ***
## age 5.856e-02 3.013e-03 19.432 < 2e-16 ***
## bmi 1.747e-04 2.183e-03 0.080 0.9362
## children 3.662e-01 3.281e-02 11.159 < 2e-16 ***
## smokeryes 1.574e+00 3.493e-01 4.507 7.16e-06 ***
## regionnorthwest -8.749e-02 1.513e-02 -5.782 9.18e-09 ***
## regionsoutheast -2.888e-01 1.437e-02 -20.097 < 2e-16 ***
## regionsouthwest -2.655e-01 1.429e-02 -18.579 < 2e-16 ***
## age:children -7.850e-03 5.669e-04 -13.848 < 2e-16 ***
## bmi:smokeryes 5.534e-02 1.163e-02 4.760 2.15e-06 ***
## age:bmi -1.595e-05 9.477e-05 -0.168 0.8664
## age:smokeryes -4.063e-02 4.978e-03 -8.162 7.59e-16 ***
## bmi:children 1.791e-03 9.861e-04 1.816 0.0696 .
## children:smokeryes -1.115e-01 5.474e-02 -2.036 0.0419 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.086e-05 on 1324 degrees of freedom
## Multiple R-squared: 0.9239, Adjusted R-squared: 0.9232
## F-statistic: 1237 on 13 and 1324 DF, p-value: < 2.2e-16
plot(f4)
ols_step_backward_p(f4,details = TRUE) # apply backward elimination again
## Backward Elimination Method
## ---------------------------
##
## Candidate Terms:
##
## 1 . age
## 2 . bmi
## 3 . children
## 4 . smoker
## 5 . region
## 6 . age:children
## 7 . bmi:smoker
## 8 . age:bmi
## 9 . age:smoker
## 10 . bmi:children
## 11 . children:smoker
##
## We are eliminating variables based on p value...
##
## - bmi:children
##
## Backward Elimination: Step 1
##
## Variable bmi:children Removed
##
## Model Summary
## -------------------------------------------------------------
## R 0.912 RMSE 0.380
## R-Squared 0.831 Coef. Var 4.173
## Adj. R-Squared 0.829 MSE 0.144
## Pred R-Squared 0.828 MAE 0.207
## -------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## -----------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## -----------------------------------------------------------------------
## Regression 939.444 12 78.287 543.006 0.0000
## Residual 191.030 1325 0.144
## Total 1130.474 1337
## -----------------------------------------------------------------------
##
## Parameter Estimates
## ------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## ------------------------------------------------------------------------------------------------
## (Intercept) 6.838 0.161 42.517 0.000 6.522 7.153
## age 0.047 0.004 0.721 12.217 0.000 0.040 0.055
## bmi 0.005 0.005 0.031 0.918 0.359 -0.005 0.015
## children 0.282 0.028 0.369 10.176 0.000 0.227 0.336
## smokeryes 1.412 0.145 0.620 9.746 0.000 1.128 1.696
## regionnorthwest -0.057 0.030 -0.027 -1.923 0.055 -0.116 0.001
## regionsoutheast -0.141 0.030 -0.068 -4.694 0.000 -0.200 -0.082
## regionsouthwest -0.148 0.030 -0.069 -4.945 0.000 -0.207 -0.089
## age:children -0.004 0.001 -0.218 -5.907 0.000 -0.005 -0.003
## bmi:smokeryes 0.050 0.004 0.688 12.027 0.000 0.042 0.058
## age:bmi 0.000 0.000 -0.049 -0.702 0.483 0.000 0.000
## age:smokeryes -0.032 0.002 -0.591 -17.416 0.000 -0.036 -0.029
## children:smokeryes -0.123 0.022 -0.093 -5.581 0.000 -0.167 -0.080
## ------------------------------------------------------------------------------------------------
##
##
## - age:bmi
##
## Backward Elimination: Step 2
##
## Variable age:bmi Removed
##
## Model Summary
## -------------------------------------------------------------
## R 0.912 RMSE 0.380
## R-Squared 0.831 Coef. Var 4.172
## Adj. R-Squared 0.830 MSE 0.144
## Pred R-Squared 0.828 MAE 0.208
## -------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## -----------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## -----------------------------------------------------------------------
## Regression 939.373 11 85.398 592.552 0.0000
## Residual 191.101 1326 0.144
## Total 1130.474 1337
## -----------------------------------------------------------------------
##
## Parameter Estimates
## ------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## ------------------------------------------------------------------------------------------------
## (Intercept) 6.939 0.072 96.763 0.000 6.798 7.080
## age 0.045 0.001 0.681 45.649 0.000 0.043 0.047
## bmi 0.001 0.002 0.009 0.696 0.487 -0.003 0.005
## children 0.282 0.028 0.370 10.195 0.000 0.228 0.336
## smokeryes 1.410 0.145 0.619 9.735 0.000 1.126 1.694
## regionnorthwest -0.057 0.030 -0.027 -1.907 0.057 -0.115 0.002
## regionsoutheast -0.140 0.030 -0.068 -4.671 0.000 -0.199 -0.081
## regionsouthwest -0.149 0.030 -0.069 -4.960 0.000 -0.207 -0.090
## age:children -0.004 0.001 -0.219 -5.928 0.000 -0.005 -0.003
## bmi:smokeryes 0.050 0.004 0.690 12.068 0.000 0.042 0.058
## age:smokeryes -0.032 0.002 -0.592 -17.437 0.000 -0.036 -0.029
## children:smokeryes -0.124 0.022 -0.093 -5.585 0.000 -0.167 -0.080
## ------------------------------------------------------------------------------------------------
##
##
##
## No more variables satisfy the condition of p value = 0.3
##
##
## Variables Removed:
##
## - bmi:children
## - age:bmi
##
##
## Final Model Output
## ------------------
##
## Model Summary
## -------------------------------------------------------------
## R 0.912 RMSE 0.380
## R-Squared 0.831 Coef. Var 4.172
## Adj. R-Squared 0.830 MSE 0.144
## Pred R-Squared 0.828 MAE 0.208
## -------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## -----------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## -----------------------------------------------------------------------
## Regression 939.373 11 85.398 592.552 0.0000
## Residual 191.101 1326 0.144
## Total 1130.474 1337
## -----------------------------------------------------------------------
##
## Parameter Estimates
## ------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## ------------------------------------------------------------------------------------------------
## (Intercept) 6.939 0.072 96.763 0.000 6.798 7.080
## age 0.045 0.001 0.681 45.649 0.000 0.043 0.047
## bmi 0.001 0.002 0.009 0.696 0.487 -0.003 0.005
## children 0.282 0.028 0.370 10.195 0.000 0.228 0.336
## smokeryes 1.410 0.145 0.619 9.735 0.000 1.126 1.694
## regionnorthwest -0.057 0.030 -0.027 -1.907 0.057 -0.115 0.002
## regionsoutheast -0.140 0.030 -0.068 -4.671 0.000 -0.199 -0.081
## regionsouthwest -0.149 0.030 -0.069 -4.960 0.000 -0.207 -0.090
## age:children -0.004 0.001 -0.219 -5.928 0.000 -0.005 -0.003
## bmi:smokeryes 0.050 0.004 0.690 12.068 0.000 0.042 0.058
## age:smokeryes -0.032 0.002 -0.592 -17.437 0.000 -0.036 -0.029
## children:smokeryes -0.124 0.022 -0.093 -5.585 0.000 -0.167 -0.080
## ------------------------------------------------------------------------------------------------
##
##
## Elimination Summary
## ---------------------------------------------------------------------------------------
## Variable Adj.
## Step Removed R-Square R-Square C(p) AIC RMSE
## ---------------------------------------------------------------------------------------
## 1 bmi:children 0.831 0.8295 29218352442.9501 1220.6601 0.3797
## 2 age:bmi 0.831 0.8296 29229226142.6719 1219.1580 0.3796
## ---------------------------------------------------------------------------------------
f5<-lm(log(charges) ~ age + bmi + children + smoker+region+age:children+bmi:smoker+age:smoker+children:smoker, weights = 1/SD^2,data = df2)
summary(f5) # final model, the good one
##
## Call:
## lm(formula = log(charges) ~ age + bmi + children + smoker + region +
## age:children + bmi:smoker + age:smoker + children:smoker,
## data = df2, weights = 1/SD^2)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -3.754e-04 -2.966e-05 -1.020e-06 2.947e-05 3.765e-04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.4627945 0.0286551 225.537 < 2e-16 ***
## age 0.0580826 0.0006916 83.985 < 2e-16 ***
## bmi 0.0005078 0.0007608 0.667 0.5046
## children 0.4170594 0.0170953 24.396 < 2e-16 ***
## smokeryes 1.5416300 0.3487738 4.420 1.07e-05 ***
## regionnorthwest -0.0874500 0.0151389 -5.777 9.49e-09 ***
## regionsoutheast -0.2893022 0.0143764 -20.123 < 2e-16 ***
## regionsouthwest -0.2646088 0.0142848 -18.524 < 2e-16 ***
## age:children -0.0077483 0.0005646 -13.724 < 2e-16 ***
## bmi:smokeryes 0.0566649 0.0115877 4.890 1.13e-06 ***
## age:smokeryes -0.0406418 0.0049718 -8.175 6.87e-16 ***
## children:smokeryes -0.1185118 0.0546348 -2.169 0.0302 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.091e-05 on 1326 degrees of freedom
## Multiple R-squared: 0.9237, Adjusted R-squared: 0.9231
## F-statistic: 1460 on 11 and 1326 DF, p-value: < 2.2e-16
residualPlot(f5)
plot(f5)
## Questions and Findings 0. descriptive stat
summary(df)
## age sex bmi children smoker
## Min. :18.00 female:662 Min. :15.96 Min. :0.000 no :1064
## 1st Qu.:27.00 male :676 1st Qu.:26.30 1st Qu.:0.000 yes: 274
## Median :39.00 Median :30.40 Median :1.000
## Mean :39.21 Mean :30.66 Mean :1.095
## 3rd Qu.:51.00 3rd Qu.:34.69 3rd Qu.:2.000
## Max. :64.00 Max. :53.13 Max. :5.000
## region charges
## northeast:324 Min. : 1122
## northwest:325 1st Qu.: 4740
## southeast:364 Median : 9382
## southwest:325 Mean :13270
## 3rd Qu.:16640
## Max. :63770
#head(df2)
df2$sex <- ifelse(df2$sex=="male", 1, 0)
df2$smoker <- ifelse(df2$smoker=="yes", 1, 0)
head(df2)
df3 <- df2[, c(1,2,3,4,5,7)]
head(df3)
cormat <- round(cor(df3),2)
head(cormat)
## age sex bmi children smoker charges
## age 1.00 -0.02 0.11 0.04 -0.03 0.30
## sex -0.02 1.00 0.05 0.02 0.08 0.06
## bmi 0.11 0.05 1.00 0.01 0.00 0.20
## children 0.04 0.02 0.01 1.00 0.01 0.07
## smoker -0.03 0.08 0.00 0.01 1.00 0.79
## charges 0.30 0.06 0.20 0.07 0.79 1.00
melted_cormat <- melt(cormat)
head(melted_cormat)
ggplot(data = melted_cormat, aes(x=Var1, y=Var2, fill=value)) +
geom_tile()
get_lower_tri<-function(cormat){
cormat[upper.tri(cormat)] <- NA
return(cormat)
}
get_upper_tri <- function(cormat){
cormat[lower.tri(cormat)]<- NA
return(cormat)
}
upper_tri <- get_upper_tri(cormat)
upper_tri
## age sex bmi children smoker charges
## age 1 -0.02 0.11 0.04 -0.03 0.30
## sex NA 1.00 0.05 0.02 0.08 0.06
## bmi NA NA 1.00 0.01 0.00 0.20
## children NA NA NA 1.00 0.01 0.07
## smoker NA NA NA NA 1.00 0.79
## charges NA NA NA NA NA 1.00
melted_cormat <- melt(upper_tri, na.rm = TRUE)
ggplot(data = melted_cormat, aes(Var2, Var1, fill = value))+
geom_tile(color = "white")+
scale_fill_gradient2(low = "red", high = "blue", mid = "pink",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Correlation Heatmap") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 12, hjust = 1))+
coord_fixed()
boxplot(charges~smoker,
data=df,
main="Charges for Non-Smokers vs Smokers",
xlab="Smoking Habit",
ylab="Medical Insurance Cost",
col="orange",
border="brown"
)
boxplot(charges~region,
data=df,
main="Charges for US Regions",
xlab="US Regions",
ylab="Medical Insurance Cost",
col="orange",
border="brown"
)
confint(f5,"age",level=0.95)
## 2.5 % 97.5 %
## age 0.05672583 0.05943928
with smoking?
exp(1.5416300) # a smoker will spend $4.67 more than a non-smoker.
## [1] 4.6722
with children?
confint(f5,"children",level=0.95) # and yes
## 2.5 % 97.5 %
## children 0.3835226 0.4505962
bmi?
confint(f5,"bmi",level=0.95) #No, not in the final model
## 2.5 % 97.5 %
## bmi -0.0009847388 0.002000344
df4 <- data.frame(smoker = "no", children = 0, age = 22,bmi=17.9,region="northwest")
p1<-predict(f5, df4, interval = 'prediction')
## Warning in predict.lm(f5, df4, interval = "prediction"): Assuming constant prediction variance even though model fit is weighted
p1
## fit lwr upr
## 1 7.66225 7.63622 7.688281
exp(p1)
## fit lwr upr
## 1 2126.537 2071.896 2182.62
not fitted in the same dataframe because I added a new column for the weight but the predictors did not change.
model.sel(f1, f2, f3, f4, f5, f6, f7, rank = AIC)
## Warning in model.sel.default(f1, f2, f3, f4, f5, f6, f7, rank = AIC): response
## differs between models
## Warning in model.sel.default(f1, f2, f3, f4, f5, f6, f7, rank = AIC): models are
## not all fitted to the same data
#parameterEstimates(f5)
anova(f5)
summary(f5)
##
## Call:
## lm(formula = log(charges) ~ age + bmi + children + smoker + region +
## age:children + bmi:smoker + age:smoker + children:smoker,
## data = df2, weights = 1/SD^2)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -3.754e-04 -2.966e-05 -1.020e-06 2.947e-05 3.765e-04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.4627945 0.0286551 225.537 < 2e-16 ***
## age 0.0580826 0.0006916 83.985 < 2e-16 ***
## bmi 0.0005078 0.0007608 0.667 0.5046
## children 0.4170594 0.0170953 24.396 < 2e-16 ***
## smokeryes 1.5416300 0.3487738 4.420 1.07e-05 ***
## regionnorthwest -0.0874500 0.0151389 -5.777 9.49e-09 ***
## regionsoutheast -0.2893022 0.0143764 -20.123 < 2e-16 ***
## regionsouthwest -0.2646088 0.0142848 -18.524 < 2e-16 ***
## age:children -0.0077483 0.0005646 -13.724 < 2e-16 ***
## bmi:smokeryes 0.0566649 0.0115877 4.890 1.13e-06 ***
## age:smokeryes -0.0406418 0.0049718 -8.175 6.87e-16 ***
## children:smokeryes -0.1185118 0.0546348 -2.169 0.0302 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.091e-05 on 1326 degrees of freedom
## Multiple R-squared: 0.9237, Adjusted R-squared: 0.9231
## F-statistic: 1460 on 11 and 1326 DF, p-value: < 2.2e-16